Pembuatan Model Prediksi Hujan Menggunakan Metode K-Nearest Neighbors
Authors
Romeo
Ruslan
Hana
Haifan
Ana
Zahra
Import Libraries
Code
# Numerical computation and data preprocessing.
import numpy as np
import pandas as pd

# Plotting libraries.
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import seaborn as sns
from plotly.subplots import make_subplots

# Set the default renderer used when Plotly figures are shown.
pio.renderers.default = "plotly_mimetype+notebook_connected"
Import Dataset
Code
# Load the dataset from 'weatherAUS.csv' (path relative to the working
# directory) into the DataFrame used by the rest of the notebook.
df = pd.read_csv('weatherAUS.csv')
Eksplorasi Data
Preview dataset
Code
# Preview the first rows of the dataset (head() defaults to 5 rows).
df.head()
Date
Location
MinTemp
MaxTemp
Rainfall
Evaporation
Sunshine
WindGustDir
WindGustSpeed
WindDir9am
...
Humidity9am
Humidity3pm
Pressure9am
Pressure3pm
Cloud9am
Cloud3pm
Temp9am
Temp3pm
RainToday
RainTomorrow
0
2008-12-01
Albury
13.4
22.9
0.6
NaN
NaN
W
44.0
W
...
71.0
22.0
1007.7
1007.1
8.0
NaN
16.9
21.8
No
No
1
2008-12-02
Albury
7.4
25.1
0.0
NaN
NaN
WNW
44.0
NNW
...
44.0
25.0
1010.6
1007.8
NaN
NaN
17.2
24.3
No
No
2
2008-12-03
Albury
12.9
25.7
0.0
NaN
NaN
WSW
46.0
W
...
38.0
30.0
1007.6
1008.7
NaN
2.0
21.0
23.2
No
No
3
2008-12-04
Albury
9.2
28.0
0.0
NaN
NaN
NE
24.0
SE
...
45.0
16.0
1017.6
1012.8
NaN
NaN
18.1
26.5
No
No
4
2008-12-05
Albury
17.5
32.3
1.0
NaN
NaN
W
41.0
ENE
...
82.0
33.0
1010.8
1006.0
7.0
8.0
17.8
29.7
No
No
5 rows × 23 columns
Note
Variabel target adalah RainTomorrow.
View dimension of dataset
Code
# Dimensions of the dataset as a (rows, columns) tuple.
df.shape
(145460, 23)
Bisa kita lihat bahwa ada 145460 baris dan 23 kolom yang terdapat di dalam dataset.
Terdapat 6 variabel kategorikal dalam dataset. Variabel-variabel tersebut adalah: Location, WindGustDir, WindDir9am, WindDir3pm, RainToday, dan RainTomorrow.
Ada dua variabel kategorikal biner yaitu RainToday dan RainTomorrow
RainTomorrow adalah variabel target
Missing values in Categorical Variables
Code
# Count the missing values in each column of `categorical` and show them as a
# one-column table.
# NOTE(review): `categorical` is not defined in the visible part of this
# document; presumably it is the subset of `df` holding the categorical
# columns - confirm against the cell that creates it.
categorical.isna().sum().to_frame('number of null values')
# Plot MinTemp and MaxTemp against Date for the last 950 rows of the dataset,
# shading the band between the two curves.
df_dateplot = df.iloc[-950:, :]

# The CSV is loaded without date parsing, so unless 'Date' was converted
# elsewhere it holds strings, which matplotlib would place on a categorical
# axis (one tick per row). Parse it so the x-axis is a real time axis.
# pd.to_datetime leaves values that are already datetimes unchanged.
plot_dates = pd.to_datetime(df_dateplot['Date'])

plt.figure(figsize=[20, 5])
plt.plot(plot_dates, df_dateplot['MinTemp'],
         color='blue', linewidth=1, label='MinTemp')
plt.plot(plot_dates, df_dateplot['MaxTemp'],
         color='red', linewidth=1, label='MaxTemp')
plt.fill_between(plot_dates, df_dateplot['MinTemp'], df_dateplot['MaxTemp'],
                 facecolor='#EBF78F')
plt.title('MinTemp vs MaxTemp by Date')
plt.legend(loc='lower left', frameon=False)
plt.show()
Plot di atas menunjukkan bahwa temperatur minimum dan maksimum naik dan turun mengikuti pola musiman yang berulang setiap tahunnya.
Kondisi musim selalu berlawanan di kedua belahan bumi. Karena Australia terletak di belahan bumi selatan, musim-musimnya berkebalikan dengan musim di belahan bumi utara.
Seperti yang dapat kita lihat, Desember hingga Februari adalah musim panas, Maret hingga Mei adalah musim gugur, Juni hingga Agustus adalah musim dingin, dan September hingga November adalah musim semi.
Visualisasi Distribusi Variabel Numerik
def _histogram_density_curve(values, bins=30, n_points=100):
    """Return ``(x, y)`` for a density line derived from a histogram.

    The density of each histogram bin is anchored at that bin's centre and
    linearly interpolated onto ``n_points`` evenly spaced x positions between
    the minimum and maximum of ``values``. This is a histogram-based
    approximation, not a true kernel density estimate.
    """
    density, edges = np.histogram(values, bins=bins, density=True)
    # Each density value describes a whole bin, so it belongs at the bin
    # centre rather than on an evenly spaced grid from min to max.
    centers = (edges[:-1] + edges[1:]) / 2
    x = np.linspace(values.min(), values.max(), n_points)
    return x, np.interp(x, centers, density)


def plot_numerical_distributions(df):
    """Build a grid of density histograms, one per float64 column of ``df``.

    Each subplot shows a probability-density histogram of the column's
    non-NaN values. When the column has more than one such value, a red
    density line interpolated from a 30-bin histogram is drawn on top.
    Columns that contain only NaN keep their titled, empty subplot.

    Args:
        df: DataFrame whose ``float64`` columns are plotted.

    Returns:
        The Plotly figure containing the subplot grid (4 subplots per row).
    """
    numerical_cols = df.select_dtypes(include=['float64']).columns

    n_cols = 4
    n_rows = int(np.ceil(len(numerical_cols) / n_cols))

    fig = make_subplots(
        rows=n_rows,
        cols=n_cols,
        subplot_titles=list(numerical_cols),
        vertical_spacing=0.1,
        horizontal_spacing=0.05,
    )

    for index, col_name in enumerate(numerical_cols):
        # Subplot titles are assigned in column order, so the grid position is
        # derived from the column index; an all-NaN column still consumes its
        # slot and the remaining titles stay aligned with their plots.
        row, col = (offset + 1 for offset in divmod(index, n_cols))

        clean_data = df[col_name].dropna()
        if clean_data.empty:
            continue

        fig.add_trace(
            go.Histogram(
                x=clean_data,
                name=col_name,
                nbinsx=30,
                histnorm='probability density',
            ),
            row=row,
            col=col,
        )

        # A density line needs at least two observations.
        if len(clean_data) > 1:
            curve_x, curve_y = _histogram_density_curve(clean_data)
            fig.add_trace(
                go.Scatter(
                    x=curve_x,
                    y=curve_y,
                    name=f'{col_name}_kde',
                    line=dict(color='red'),
                ),
                row=row,
                col=col,
            )

    fig.update_layout(
        height=500 * n_rows,  # scale the figure height with the number of rows
        width=1600,
        showlegend=False,
        title_text="Distribusi Variabel Numerik",
    )
    fig.update_xaxes(title_text="Value")
    fig.update_yaxes(title_text="Density")
    return fig


numeric_dist_fig = plot_numerical_distributions(df)
numeric_dist_fig.show()